import pandas as pd
import yfinance as yf
import numpy as np
import math
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from scipy import stats
from scipy import optimize
import statsmodels.api as sm
from statsmodels.formula.api import ols
from bioinfokit.analys import stat
from scipy.stats import mannwhitneyu
from scipy.stats import wilcoxon
from scipy.stats import kruskal
acoes = ['ITUB3.SA', 'MGLU3.SA', 'CIEL3.SA', 'PETR3.SA', 'CSNA3.SA', '^BVSP']
acoes
['ITUB3.SA', 'MGLU3.SA', 'CIEL3.SA', 'PETR3.SA', 'CSNA3.SA', '^BVSP']
acoes_df = pd.DataFrame()
for acao in acoes:
acoes_df[acao] = yf.download(acao,
start='2012-01-01', end='2023-01-01')['Close']
acoes_df.index = acoes_df.index.strftime('%Y-%m-%d')
[*********************100%***********************] 1 of 1 completed [*********************100%***********************] 1 of 1 completed [*********************100%***********************] 1 of 1 completed [*********************100%***********************] 1 of 1 completed [*********************100%***********************] 1 of 1 completed [*********************100%***********************] 1 of 1 completed
acoes_df.reset_index(inplace=True)
acoes_df
| Date | ITUB3.SA | MGLU3.SA | CIEL3.SA | PETR3.SA | CSNA3.SA | ^BVSP | |
|---|---|---|---|---|---|---|---|
| 0 | 2012-01-02 | 12.845206 | 0.290625 | 9.715390 | 23.209999 | 15.11 | NaN |
| 1 | 2012-01-03 | 13.072877 | 0.288750 | 9.751559 | 24.020000 | 15.89 | 59265.0 |
| 2 | 2012-01-04 | 13.195819 | 0.285937 | 9.785718 | 24.170000 | 16.00 | 59365.0 |
| 3 | 2012-01-05 | 12.990915 | 0.283437 | 9.645061 | 24.020000 | 15.87 | 58546.0 |
| 4 | 2012-01-06 | 13.182159 | 0.291250 | 9.584780 | 24.000000 | 15.83 | 58600.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2724 | 2022-12-23 | 21.860001 | 2.680000 | 5.180000 | 28.559999 | 13.91 | 109698.0 |
| 2725 | 2022-12-26 | 21.580000 | 2.660000 | 5.130000 | 28.469999 | 14.14 | 108738.0 |
| 2726 | 2022-12-27 | 21.379999 | 2.520000 | 5.080000 | 28.660000 | 14.46 | 108347.0 |
| 2727 | 2022-12-28 | 21.930000 | 2.690000 | 5.210000 | 28.500000 | 14.79 | 110237.0 |
| 2728 | 2022-12-29 | 21.889999 | 2.740000 | 5.240000 | 28.040001 | 14.55 | 110031.0 |
2729 rows × 7 columns
dataset = acoes_df.copy()
dataset.drop(labels = ['Date'], axis=1, inplace=True)
taxas_retorno = np.log(dataset / dataset.shift(1))
dataset_date = acoes_df.copy()
date = dataset_date.filter(["Date"])
taxas_retorno = pd.concat([date, taxas_retorno], axis=1)
taxas_retorno
| Date | ITUB3.SA | MGLU3.SA | CIEL3.SA | PETR3.SA | CSNA3.SA | ^BVSP | |
|---|---|---|---|---|---|---|---|
| 0 | 2012-01-02 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2012-01-03 | 0.017569 | -0.006473 | 0.003716 | 0.034304 | 0.050333 | NaN |
| 2 | 2012-01-04 | 0.009360 | -0.009790 | 0.003497 | 0.006225 | 0.006899 | 0.001686 |
| 3 | 2012-01-05 | -0.015650 | -0.008782 | -0.014478 | -0.006225 | -0.008158 | -0.013892 |
| 4 | 2012-01-06 | 0.014614 | 0.027192 | -0.006270 | -0.000833 | -0.002524 | 0.000922 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2724 | 2022-12-23 | 0.013355 | 0.030305 | 0.021464 | 0.049894 | -0.022744 | 0.021944 |
| 2725 | 2022-12-26 | -0.012892 | -0.007491 | -0.009699 | -0.003156 | 0.016400 | -0.008790 |
| 2726 | 2022-12-27 | -0.009311 | -0.054067 | -0.009794 | 0.006652 | 0.022379 | -0.003602 |
| 2727 | 2022-12-28 | 0.025400 | 0.065282 | 0.025269 | -0.005598 | 0.022565 | 0.017294 |
| 2728 | 2022-12-29 | -0.001826 | 0.018417 | 0.005742 | -0.016272 | -0.016360 | -0.001870 |
2729 rows × 7 columns
figura = px.line(title = 'Histórico de retorno das ações')
for i in taxas_retorno.columns[1:]:
figura.add_scatter(x = taxas_retorno["Date"] ,y = taxas_retorno[i], name = i)
figura.show()
acoes_port = acoes_df.copy()
acoes_port.drop(labels = ['^BVSP'], axis=1, inplace=True)
log_ret = acoes_port.copy()
log_ret.drop(labels = ["Date"], axis = 1, inplace = True)
log_ret = np.log(log_ret/log_ret.shift(1))
np.random.seed(42)
num_ports = 10000
all_weights = np.zeros((num_ports, len(acoes_port.columns[1:])))
ret_arr = np.zeros(num_ports)
vol_arr = np.zeros(num_ports)
sharpe_arr = np.zeros(num_ports)
for x in range(num_ports):
# Weights
weights = np.array(np.random.random(5))
weights = weights/np.sum(weights)
# Save weights
all_weights[x,:] = weights
# Expected return
ret_arr[x] = np.sum((log_ret.mean() * weights))
# Expected volatility
vol_arr[x] = np.sqrt(np.dot(weights.T, np.dot(log_ret.cov(), weights)))
# Sharpe Ratio
sharpe_arr[x] = ret_arr[x]/vol_arr[x]
print("Max Sharpe Ratio: {}". format(sharpe_arr.max()))
print("Local do Max Sharpe Ratio: {}". format(sharpe_arr.argmax()))
Max Sharpe Ratio: 0.021130418245689463 Local do Max Sharpe Ratio: 7063
# Pesos do Portfólio do Max Sharpe Ratio
print(all_weights[7063,:])
[0.32020505 0.62806883 0.01391511 0.01269245 0.02511856]
taxas_retorno["CARTEIRA"] = (0.32020505*taxas_retorno["ITUB3.SA"] + 0.62806883*taxas_retorno["MGLU3.SA"] +
0.01391511*taxas_retorno["CIEL3.SA"] + 0.01269245*taxas_retorno["PETR3.SA"] +
0.02511856*taxas_retorno["CSNA3.SA"])
taxas_retorno
| Date | ITUB3.SA | MGLU3.SA | CIEL3.SA | PETR3.SA | CSNA3.SA | ^BVSP | CARTEIRA | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2012-01-02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2012-01-03 | 0.017569 | -0.006473 | 0.003716 | 0.034304 | 0.050333 | NaN | 0.003312 |
| 2 | 2012-01-04 | 0.009360 | -0.009790 | 0.003497 | 0.006225 | 0.006899 | 0.001686 | -0.002850 |
| 3 | 2012-01-05 | -0.015650 | -0.008782 | -0.014478 | -0.006225 | -0.008158 | -0.013892 | -0.011012 |
| 4 | 2012-01-06 | 0.014614 | 0.027192 | -0.006270 | -0.000833 | -0.002524 | 0.000922 | 0.021597 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2724 | 2022-12-23 | 0.013355 | 0.030305 | 0.021464 | 0.049894 | -0.022744 | 0.021944 | 0.023671 |
| 2725 | 2022-12-26 | -0.012892 | -0.007491 | -0.009699 | -0.003156 | 0.016400 | -0.008790 | -0.008596 |
| 2726 | 2022-12-27 | -0.009311 | -0.054067 | -0.009794 | 0.006652 | 0.022379 | -0.003602 | -0.036429 |
| 2727 | 2022-12-28 | 0.025400 | 0.065282 | 0.025269 | -0.005598 | 0.022565 | 0.017294 | 0.049982 |
| 2728 | 2022-12-29 | -0.001826 | 0.018417 | 0.005742 | -0.016272 | -0.016360 | -0.001870 | 0.010445 |
2729 rows × 8 columns
acoes_port = acoes_df.copy()
acoes_port.drop(labels = ['^BVSP'], axis=1, inplace=True)
log_ret = acoes_port.copy()
log_ret.drop(labels = ["Date"], axis = 1, inplace = True)
log_ret = np.log(log_ret/log_ret.shift(1))
np.random.seed(123)
num_ports = 10000
all_weights = np.zeros((num_ports, len(acoes_port.columns[1:])))
ret_arr = np.zeros(num_ports)
vol_arr = np.zeros(num_ports)
sharpe_arr = np.zeros(num_ports)
for x in range(num_ports):
# Weights
weights = np.array(np.random.random(5))
weights = weights/np.sum(weights)
# Save weights
all_weights[x,:] = weights
# Expected return
ret_arr[x] = np.sum((log_ret.mean() * weights))
# Expected volatility
vol_arr[x] = np.sqrt(np.dot(weights.T, np.dot(log_ret.cov(), weights)))
# Sharpe Ratio
sharpe_arr[x] = ret_arr[x]/vol_arr[x]
print("Max Sharpe Ratio: {}". format(sharpe_arr.max()))
print("Local do Max Sharpe Ratio: {}". format(sharpe_arr.argmax()))
Max Sharpe Ratio: 0.02089881321285652 Local do Max Sharpe Ratio: 7547
# Pesos do Portfólio do Max Sharpe Ratio
print(all_weights[7547,:])
[0.27851125 0.6314676 0.01505158 0.06325824 0.01171134]
taxas_retorno["CARTEIRA_NOVA"] = (0.27851125*taxas_retorno["ITUB3.SA"] + 0.6314676*taxas_retorno["MGLU3.SA"] +
0.01505158*taxas_retorno["CIEL3.SA"] + 0.06325824*taxas_retorno["PETR3.SA"] +
0.01171134*taxas_retorno["CSNA3.SA"])
taxas_retorno
| Date | ITUB3.SA | MGLU3.SA | CIEL3.SA | PETR3.SA | CSNA3.SA | ^BVSP | CARTEIRA | CARTEIRA_NOVA | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2012-01-02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2012-01-03 | 0.017569 | -0.006473 | 0.003716 | 0.034304 | 0.050333 | NaN | 0.003312 | 0.003621 |
| 2 | 2012-01-04 | 0.009360 | -0.009790 | 0.003497 | 0.006225 | 0.006899 | 0.001686 | -0.002850 | -0.003048 |
| 3 | 2012-01-05 | -0.015650 | -0.008782 | -0.014478 | -0.006225 | -0.008158 | -0.013892 | -0.011012 | -0.010611 |
| 4 | 2012-01-06 | 0.014614 | 0.027192 | -0.006270 | -0.000833 | -0.002524 | 0.000922 | 0.021597 | 0.021064 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2724 | 2022-12-23 | 0.013355 | 0.030305 | 0.021464 | 0.049894 | -0.022744 | 0.021944 | 0.023671 | 0.026069 |
| 2725 | 2022-12-26 | -0.012892 | -0.007491 | -0.009699 | -0.003156 | 0.016400 | -0.008790 | -0.008596 | -0.008474 |
| 2726 | 2022-12-27 | -0.009311 | -0.054067 | -0.009794 | 0.006652 | 0.022379 | -0.003602 | -0.036429 | -0.036200 |
| 2727 | 2022-12-28 | 0.025400 | 0.065282 | 0.025269 | -0.005598 | 0.022565 | 0.017294 | 0.049982 | 0.048588 |
| 2728 | 2022-12-29 | -0.001826 | 0.018417 | 0.005742 | -0.016272 | -0.016360 | -0.001870 | 0.010445 | 0.009987 |
2729 rows × 9 columns
taxas_retorno_port = taxas_retorno.filter(["Date", "CARTEIRA", "CARTEIRA_NOVA", "^BVSP"])
taxas_retorno_port
| Date | CARTEIRA | CARTEIRA_NOVA | ^BVSP | |
|---|---|---|---|---|
| 0 | 2012-01-02 | NaN | NaN | NaN |
| 1 | 2012-01-03 | 0.003312 | 0.003621 | NaN |
| 2 | 2012-01-04 | -0.002850 | -0.003048 | 0.001686 |
| 3 | 2012-01-05 | -0.011012 | -0.010611 | -0.013892 |
| 4 | 2012-01-06 | 0.021597 | 0.021064 | 0.000922 |
| ... | ... | ... | ... | ... |
| 2724 | 2022-12-23 | 0.023671 | 0.026069 | 0.021944 |
| 2725 | 2022-12-26 | -0.008596 | -0.008474 | -0.008790 |
| 2726 | 2022-12-27 | -0.036429 | -0.036200 | -0.003602 |
| 2727 | 2022-12-28 | 0.049982 | 0.048588 | 0.017294 |
| 2728 | 2022-12-29 | 0.010445 | 0.009987 | -0.001870 |
2729 rows × 4 columns
figura = px.line(title = 'Comparação de retorno Carteira x Ibovespa')
for i in taxas_retorno_port.columns[1:]:
figura.add_scatter(x = taxas_retorno_port["Date"] ,y = taxas_retorno_port[i], name = i)
figura.add_hline(y = taxas_retorno_port['CARTEIRA'].mean(), line_color="blue", line_dash="dot", )
figura.add_hline(y = taxas_retorno_port['CARTEIRA_NOVA'].mean(), line_color="red", line_dash="dot", )
figura.show()
taxa_port = taxas_retorno_port.copy()
taxa_port.drop(labels = ['Date'], axis=1, inplace=True)
taxa_port = taxa_port.dropna()
taxa_port
| CARTEIRA | CARTEIRA_NOVA | ^BVSP | |
|---|---|---|---|
| 2 | -0.002850 | -0.003048 | 0.001686 |
| 3 | -0.011012 | -0.010611 | -0.013892 |
| 4 | 0.021597 | 0.021064 | 0.000922 |
| 5 | -0.005678 | -0.004720 | 0.008209 |
| 6 | -0.005269 | -0.005444 | 0.012163 |
| ... | ... | ... | ... |
| 2724 | 0.023671 | 0.026069 | 0.021944 |
| 2725 | -0.008596 | -0.008474 | -0.008790 |
| 2726 | -0.036429 | -0.036200 | -0.003602 |
| 2727 | 0.049982 | 0.048588 | 0.017294 |
| 2728 | 0.010445 | 0.009987 | -0.001870 |
2711 rows × 3 columns
plt.figure(figsize=(12,8))
sns.boxplot(x="variable", y="value", data=pd.melt(taxa_port))
plt.show()
stats.probplot(taxas_retorno_port["CARTEIRA"], dist="norm", plot=plt)
plt.show()
stats.probplot(taxas_retorno_port["CARTEIRA_NOVA"], dist="norm", plot=plt)
plt.show()
stats.probplot(taxas_retorno_port["^BVSP"], dist="norm", plot=plt)
plt.show()
# Ordinary Least Squares (OLS) model
model = ols('value ~ variable', data=pd.melt(taxa_port)).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| variable | 0.000264 | 2.0 | 0.226356 | 0.797439 |
| Residual | 4.748675 | 8130.0 | NaN | NaN |
res = stat()
res.anova_stat(pd.melt(taxa_port), res_var='value', anova_model='value ~ variable')
res.anova_summary
| df | sum_sq | mean_sq | F | PR(>F) | |
|---|---|---|---|---|---|
| variable | 2.0 | 0.000264 | 0.000132 | 0.226356 | 0.797439 |
| Residual | 8130.0 | 4.748675 | 0.000584 | NaN | NaN |
res = stat()
res.tukey_hsd(pd.melt(taxa_port), res_var='value', xfac_var='variable', anova_model='value ~ variable')
res.tukey_summary
/home/joaogabrielsouza/anaconda3/lib/python3.10/site-packages/bioinfokit/analys.py:402: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. /home/joaogabrielsouza/anaconda3/lib/python3.10/site-packages/bioinfokit/analys.py:402: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. /home/joaogabrielsouza/anaconda3/lib/python3.10/site-packages/bioinfokit/analys.py:402: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
| group1 | group2 | Diff | Lower | Upper | q-value | p-value | |
|---|---|---|---|---|---|---|---|
| 0 | CARTEIRA | CARTEIRA_NOVA | 0.000001 | -0.001538 | 0.001540 | 0.002530 | 0.900000 |
| 1 | CARTEIRA | ^BVSP | 0.000383 | -0.001156 | 0.001922 | 0.825318 | 0.809700 |
| 2 | CARTEIRA_NOVA | ^BVSP | 0.000382 | -0.001157 | 0.001921 | 0.822787 | 0.810724 |
sm.qqplot(res.anova_std_residuals, line='45')
plt.xlabel("Theoretical Quantiles")
plt.ylabel("Standardized Residuals")
plt.show()
plt.hist(res.anova_model_out.resid, bins='auto', histtype='bar', ec='k')
plt.xlabel("Residuals")
plt.ylabel('Frequency')
plt.show()
w, pvalue = stats.shapiro(model.resid)
print('Statistics= %.3f, pvalue=%.3f' % (w, pvalue))
# interpret
alpha = 0.05
if pvalue > alpha:
print('Resíduo com Distribuição Normal (Não rejeita H0)')
else:
print('Diferente da Normal (rejeita H0)')
Statistics= 0.919, pvalue=0.000 Diferente da Normal (rejeita H0)
/home/joaogabrielsouza/anaconda3/lib/python3.10/site-packages/scipy/stats/_morestats.py:1816: UserWarning: p-value may not be accurate for N > 5000.
from scipy import stats
# perform the Kolmogorov-Smirnov test on the residuals
k_statistic, pvalue = stats.kstest(model.resid, 'norm')
# print the results and interpret
alpha = 0.05
if pvalue > alpha:
print('Resíduo com Distribuição Normal (Não rejeita H0)')
else:
print('Diferente da Normal (rejeita H0)')
print('K-S Statistic= %.3f, pvalue=%.3f' % (k_statistic, pvalue))
Diferente da Normal (rejeita H0) K-S Statistic= 0.465, pvalue=0.000
w, pvalue = stats.bartlett(taxa_port['CARTEIRA'], taxa_port['^BVSP'])
print('Statistics= %.3f, pvalue=%.3f' % (w, pvalue))
# interpret
alpha = 0.05
if pvalue > alpha:
print('Same Variance (Não rejeita H0)')
else:
print('Different Variance (rejeita H0)')
Statistics= 778.219, pvalue=0.000 Different Variance (rejeita H0)
res = stat()
res.bartlett(pd.melt(taxa_port), res_var='value', xfac_var='variable')
res.bartlett_summary
| Parameter | Value | |
|---|---|---|
| 0 | Test statistics (T) | 944.5239 |
| 1 | Degrees of freedom (Df) | 2.0000 |
| 2 | p value | 0.0000 |
res = stat()
res.levene(pd.melt(taxa_port), res_var='value', xfac_var='variable')
res.levene_summary
| Parameter | Value | |
|---|---|---|
| 0 | Test statistics (W) | 198.9781 |
| 1 | Degrees of freedom (Df) | 2.0000 |
| 2 | p value | 0.0000 |
stats, p = mannwhitneyu(taxa_port["CARTEIRA"], taxa_port["^BVSP"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
print('Same distribution (Não rejeita H0)')
else:
print('Different distribution (rejeita H0)')
Statistics= 3672252.000, p=0.965 Same distribution (Não rejeita H0)
stats, p = mannwhitneyu(taxa_port["CARTEIRA_NOVA"], taxa_port["^BVSP"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
print('Same distribution (Não rejeita H0)')
else:
print('Different distribution (rejeita H0)')
Statistics= 3675220.000, p=0.994 Same distribution (Não rejeita H0)
stats, p = mannwhitneyu(taxa_port["CARTEIRA_NOVA"], taxa_port["CARTEIRA"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
print('Same distribution (Não rejeita H0)')
else:
print('Different distribution (rejeita H0)')
Statistics= 3676389.000, p=0.977 Same distribution (Não rejeita H0)
stats, p = wilcoxon(taxa_port["CARTEIRA"], taxa_port["^BVSP"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
print('Same distribution (Não rejeita H0)')
else:
print('Different distribution (rejeita H0)')
Statistics= 1817344.000, p=0.635 Same distribution (Não rejeita H0)
stats, p = wilcoxon(taxa_port["CARTEIRA_NOVA"], taxa_port["^BVSP"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
print('Same distribution (Não rejeita H0)')
else:
print('Different distribution (rejeita H0)')
Statistics= 1815236.000, p=0.598 Same distribution (Não rejeita H0)
stats, p = wilcoxon(taxa_port["CARTEIRA_NOVA"], taxa_port["CARTEIRA"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
print('Same distribution (Não rejeita H0)')
else:
print('Different distribution (rejeita H0)')
Statistics= 1820993.000, p=0.775 Same distribution (Não rejeita H0)
stats, p = kruskal(taxa_port["CARTEIRA_NOVA"], taxa_port["CARTEIRA"], taxa_port["^BVSP"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
print('Same distribution (Não rejeita H0)')
else:
print('Different distribution (rejeita H0)')
Statistics= 0.002, p=0.999 Same distribution (Não rejeita H0)